In [2]:
# http://nbviewer.ipython.org/github/jming/cs109/blob/master/lec_10_cross_val.ipynb
In [ ]:
%matplotlib inline
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_selection import SelectKBest, f_regression
# sklearn.cross_validation was removed in scikit-learn 0.20; these now live in sklearn.model_selection
from sklearn.model_selection import cross_val_score, KFold
from sklearn.linear_model import LinearRegression
In [5]:
def hidden_model(x):
    # y is a linear combination of columns 5 and 10...
    result = x[:, 5] + x[:, 10]
    # ... with a little Gaussian noise (loc is the mean, scale is the standard deviation)
    result += np.random.normal(loc=0, scale=.005, size=result.shape)
    return result

def make_x(nobs):
    # nobs observations of 10**6 uniform features on [0, 3)
    return np.random.uniform(low=0, high=3, size=(nobs, 10**6))

x = make_x(20)
y = hidden_model(x)
print(x.shape)
In [7]:
selector = SelectKBest(f_regression, k=2).fit(x, y)
best_features = np.where(selector.get_support())[0]
print(best_features)
In [11]:
# get_support returns a boolean mask with the k selected features marked True, the rest False
selector.get_support()[0:10]
Out[11]:
In [13]:
# when given a single array, np.where returns the indices of the non-zero (True) values
np.where(selector.get_support())
Out[13]:
In [18]:
for b in best_features:
    plt.plot(x[:, b], y, 'o')
    plt.title('Column %i' % b)
    plt.xlabel('X')
    plt.ylabel('Y')
    plt.show()
In [19]:
xt = x[:, best_features]
clf = LinearRegression().fit(xt, y)
print "Score(R^2) is :", clf.score(xt, y)
In [20]:
y_pred = clf.predict(xt)
plt.plot(y_pred, y, 'o')
plt.plot(y, y, 'r-')
plt.xlabel('Predicted')
plt.ylabel('Observed')
Out[20]:
In [21]:
cross_val_score(clf, xt, y, cv=5).mean()
Out[21]:
In [26]:
for train, test in KFold(n_splits=10).split(xt):
    xtrain, xtest, ytrain, ytest = xt[train], xt[test], y[train], y[test]
    clf.fit(xtrain, ytrain)
    y_pred = clf.predict(xtest)
    plt.plot(y_pred, ytest, 'o')
    plt.plot(ytest, ytest, 'r-')
plt.xlabel('Predicted')
plt.ylabel('Observed')
Out[26]:
In [27]:
x2 = make_x(100)
y2 = hidden_model(x2)
x2 = x2[:, best_features]
y2_pred = clf.predict(x2)
plt.plot(y2_pred, y2, 'o')
plt.plot(y2, y2, 'r-')
plt.xlabel('Predicted')
plt.ylabel('Observed')
Out[27]:
Our model fails on genuinely new data! There was never any real correlation to find: with a million random columns and only 20 observations, a few columns correlate with y purely by chance, and because we used the entire dataset to select our k-best features, cross-validation could not detect the overfitting.
To combat overfitting, we cannot peek into the held-out fold at all. To accomplish this, we need to re-select the k-best features within each cross-validation iteration.
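Scikit-learn's Pipeline can express this leak-free procedure compactly, because cross_val_score refits every pipeline step, including the selector, on each training fold only. A minimal sketch (the step names 'select' and 'regress' are arbitrary choices here); the next cell then implements the same idea by hand so every step is visible.
In [ ]:
from sklearn.pipeline import Pipeline

# The selector is fit on each training fold only, never on the held-out fold.
pipe = Pipeline([('select', SelectKBest(score_func=f_regression, k=2)),
                 ('regress', LinearRegression())])
cross_val_score(pipe, x, y, cv=5).mean()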
In [30]:
scores = []
best_features_list = []
for train, test in KFold(n_splits=5).split(x):
    xtrain, xtest, ytrain, ytest = x[train], x[test], y[train], y[test]
    # Select the k best features using ONLY the training fold.
    b = SelectKBest(score_func=f_regression, k=2)
    b.fit(xtrain, ytrain)
    xtrain = xtrain[:, b.get_support()]
    xtest = xtest[:, b.get_support()]
    # Save the best features selected each time.
    best_features_list.append(np.where(b.get_support())[0])
    clf.fit(xtrain, ytrain)
    scores.append(clf.score(xtest, ytest))
    y_pred = clf.predict(xtest)
    plt.plot(y_pred, ytest, 'o')
    plt.plot(ytest, ytest, 'r-')
plt.xlabel('Predicted')
plt.ylabel('Observed')
print("CV Score (R^2) is:", np.mean(scores))
In [34]:
import pprint
pprint.pprint(best_features_list)
The same k-best features never show up twice across any of the 5 folds! This time we have properly detected the overfitting: the predictions for the held-out fold were awful in every iteration.
The abysmal average R^2 shows this as well. We also never recovered the truly correlated columns (5 and 10). That may simply be impossible with only 20 data points to work with, but at least we can now tell when our model isn't generalizing to new, out-of-sample data.
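As a quick, untested probe of that sample-size hypothesis, one could rerun the selection with more observations and check whether columns 5 and 10 surface. This sketch reuses make_x and hidden_model from above; the outcome depends on the random draw, and memory grows quickly since each row holds 10**6 floats.
In [ ]:
# Probe the sample-size hypothesis: with more observations, do the
# truly informative columns (5 and 10) get selected?
# make_x(100) allocates a 100 x 10**6 float64 array (~0.8 GB).
x_big = make_x(100)
y_big = hidden_model(x_big)
b = SelectKBest(score_func=f_regression, k=2).fit(x_big, y_big)
print(np.where(b.get_support())[0])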